import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint as sp_randint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import GradientBoostingClassifier
from gensim.utils import simple_preprocess
import operator
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
import scipy as sp
import itertools
from sklearn.model_selection import learning_curve
import os
import warnings
from os.path import isfile, join
from os import listdir
import seaborn as sns
import re
import time
import sys
locate_python = sys.exec_prefix
from sklearn.model_selection import StratifiedShuffleSplit


# Trigger keywords per characteristic, as (substring stems, exact-match words).
# A token triggers when it contains any stem or equals any exact word.
_KEYWORD_RULES = {
	"adjustability": (
		("adjust",),
		frozenset(),
	),
	"comfort": (
		("unpleas", "comfort", "ventila", "creaking"),
		frozenset((
			"circulation", "noise", "noisy", "noiseless", "noiselessly",
			"perspiration", "rattle", "soft", "pain", "painful", "chafe",
			"chafing", "odor", "offensive", "rattling", "rattles",
			"clicking", "creak",
		)),
	),
	"simplicity": (
		("simple", "simpli", "simplif", "complicat"),
		frozenset(("difficult", "complex", "simplicity")),
	),
	"durability": (
		("durabl", "durability", "strength", "strengthen", "dirt",
			"waterproof", "friction", "preserv"),
		frozenset((
			"strong", "break", "rot", "tougher", "leakage", "leak",
			"corrosion", "corrosive", "burst", "weak",
		)),
	),
	"appearance": (
		("conceal", "appearance", "life", "life-like", "unsight", "sightly",
			"disguis", "hides", "ugliness", "beautif", "resembl", "wrinkle",
			"embarrassment"),
		frozenset((
			"finish", "beautiful", "sight", "beauty", "neat", "neatness",
			"ugly", "neater", "handsome", "tasteful", "ornamentation",
		)),
	),
	"cost": (
		("cheap", "expens", "inexpen", "cost", "econom"),
		frozenset(),
	),
	"materials": (
		(),
		frozenset((
			"substances", "materials", "compounds", "compositions",
			"vulcanized", "duralumin", "celluloid", "laminated", "polymer",
			"certalmid", "vulcanite", "filaments", "resisting",
		)),
	),
}

# Characteristics whose keyword rules depend on the row's 'class' value.
_CLASS_DEPENDENT = ("comfort", "appearance")


def _matches_keyword(char, word, class_is):
	"""Return True when `word` triggers a context window for characteristic `char`."""
	stems, exact_words = _KEYWORD_RULES[char]
	if word not in exact_words and not any(stem in word for stem in stems):
		return False
	if char == "adjustability":
		return "adjusting" not in word
	if class_is != 623:
		# Outside class 623 a few otherwise-matching words are ignored.
		if char == "comfort":
			return word != "soft"
		if char == "appearance":
			return not ("life" in word or word == "sight"
						or "wrinkle" in word or word == "concealed")
	return True


def _context_window(tokens, j, spread):
	"""Return (context string, signed offsets) for the tokens around position `j`."""
	# Clamp to both text edges. The original only clamped the right edge when
	# j >= spread, so a hit with j < spread near the end of a short text
	# indexed past the end of the token list.
	reach = min(spread, j, len(tokens) - j)
	offsets_by_word = {}
	for k in range(reach):
		# Keyed by word: a repeated word keeps its first insertion position
		# but takes the most recently assigned offset.
		offsets_by_word[tokens[j + k]] = k
		offsets_by_word[tokens[j - k]] = -k
	ordered = sorted(offsets_by_word.items(), key=operator.itemgetter(1))
	words = ' '.join(pair[0] for pair in ordered)
	offsets = [pair[1] for pair in ordered]
	return words, offsets


def get_new_text(data, spread, char):
	"""Extract the words surrounding each trigger keyword of a characteristic.

	Every row's 'text' is tokenised with `simple_preprocess`. For each token
	that is a trigger keyword for `char`, a window of up to `spread` tokens on
	each side (offsets -(spread-1) .. spread-1, truncated at the text edges)
	is collected and ordered by signed offset from the keyword.

	Parameters:
		data: DataFrame with a 'text' column; a 'class' column is also read
			(and cast to int) when `char` is "comfort" or "appearance".
		spread: window half-width in tokens.
		char: one of "adjustability", "comfort", "simplicity", "durability",
			"appearance", "cost", "materials".

	Returns:
		A tuple (dict_list, num_tot_list, dict_list_repeat), each holding one
		entry per row of `data`:
		- dict_list: all context strings of the row joined by spaces, or
		  the string "None".
		- num_tot_list: a one-element list wrapping the per-hit offset lists,
		  or [[0]].
		- dict_list_repeat: the list of per-hit context strings, or ["None"].

	Raises:
		ValueError: if `char` is not a known characteristic (previously the
			function fell through and returned None, which only failed later
			when the caller unpacked the result).
	"""
	if char not in _KEYWORD_RULES:
		raise ValueError("Unknown characteristic: {!r}".format(char))

	needs_class = char in _CLASS_DEPENDENT
	texts = data['text'].values
	classes = data['class'].values if needs_class else None

	dict_list = []
	dict_list_repeat = []
	num_tot_list = []
	for i, text in enumerate(texts):
		tokens = simple_preprocess(text)
		class_is = int(classes[i]) if needs_class else None
		word_list = []
		num_list = []
		for j, word in enumerate(tokens):
			if _matches_keyword(char, word, class_is):
				words, offsets = _context_window(tokens, j, spread)
				word_list.append(words)
				num_list.append(offsets)

		# NOTE(review): kept from the original implementation - a row counts
		# as "no hit" when it has no trigger keyword OR when the window of its
		# FIRST hit is empty (e.g. the keyword is the very first token), even
		# if later hits produced context. Confirm whether this is intended.
		if not num_list or not num_list[0]:
			dict_list.append("None")
			dict_list_repeat.append(["None"])
			num_tot_list.append([[0]])
		else:
			dict_list.append(' '.join(word_list))
			dict_list_repeat.append(word_list)
			num_tot_list.append([num_list])
	return dict_list, num_tot_list, dict_list_repeat

def get_scores(clf, X_train_tfidf, data, test_size_use):
	"""Estimate sensitivity, specificity and accuracy of `clf` by repeated splits.

	The classifier is refitted on 200 stratified shuffle splits of
	(`X_train_tfidf`, `data['category']`) and evaluated on each held-out part.

	Parameters:
		clf: estimator exposing `fit` and `predict`; it is refitted in place
			on every split.
		X_train_tfidf: feature matrix whose rows are in the same order as the
			rows of `data`.
		data: DataFrame with a 'category' column holding the 0/1 labels.
		test_size_use: `test_size` handed to `StratifiedShuffleSplit`.

	Returns:
		(mean sensitivity %, mean specificity %, std of sensitivity %,
		std of specificity %, mean accuracy as a fraction, clf). The four
		percentages are rounded to one decimal.

	Raises:
		ZeroDivisionError: if a held-out split contains no 0-labelled or no
			1-labelled rows.
	"""
	# The splitter yields positional row indices, so index the labels by
	# position as well. The previous label-based `data.loc[...]` lookup was
	# only correct when `data` carried a default 0..n-1 index.
	labels = data['category'].values
	splitter = StratifiedShuffleSplit(n_splits=200, test_size=test_size_use)

	sens_list = []
	spec_list = []
	accuracies = []
	for train_index, test_index in splitter.split(X_train_tfidf, labels):
		fitted = clf.fit(X_train_tfidf[train_index], labels[train_index])
		y_test = labels[test_index]
		pred = fitted.predict(X_train_tfidf[test_index]).round()

		true_negatives = int(np.count_nonzero((pred == 0) & (y_test == 0)))
		true_positives = int(np.count_nonzero((pred == 1) & (y_test == 1)))
		spec_list.append(true_negatives / float(int(np.count_nonzero(y_test == 0))))
		sens_list.append(true_positives / float(int(np.count_nonzero(y_test == 1))))
		# NOTE(review): this is plain accuracy; the original accumulated it in
		# a list named `balanced_accuracy`. Confirm which metric is intended.
		accuracies.append(accuracy_score(y_test, pred))

	return (
		np.round(np.mean(sens_list) * 100, decimals=1),
		np.round(np.mean(spec_list) * 100, decimals=1),
		np.round(np.std(sens_list) * 100, decimals=1),
		np.round(np.std(spec_list) * 100, decimals=1),
		np.mean(accuracies),
		clf,
	)

def Domain_Specificity(war, data, char, spread, condition_train_time, condition_train_class, condition_test_class_pros, condition_test_class_o):
	"""Train on one (period, class) domain and score it on the other domains.

	The labelled patents and their text are re-read from
	``<parent of cwd>/Data``, keyword contexts for `char` are vectorised with
	TF-IDF, and a gradient-boosting classifier fitted on the training domain
	is evaluated on three other domains. The within-domain score comes from
	`get_scores`.

	Parameters:
		war: "CWP", "CWC", "WWP" or "WWC"; only selects the label and the
			column order of the returned row.
		data: ignored - the frame is always reloaded from disk. Kept so that
			existing callers keep working.
		char: characteristic column to use as the label.
		spread: window half-width passed to `get_new_text`.
		condition_train_time: boolean Series selecting the training period;
			its negation selects the other period.
		condition_train_class: boolean Series selecting the training class;
			its negation selects the "switch" rows of the training period.
		condition_test_class_pros, condition_test_class_o: boolean Series
			selecting the two class groups scored in the other period.
		All masks must be aligned with the index of the reloaded frame.

	Returns:
		One LaTeX table row: ``&war& a & b & c & d\\\\`` with four scores in
		percent, rounded to one decimal.

	Raises:
		TypeError: if `war` is not one of the four supported codes.
	"""
	# Which score goes into which of the four table columns, per domain.
	column_order = {
		"CWP": ("self", "switch", "pros", "other"),
		"CWC": ("switch", "self", "pros", "other"),
		"WWP": ("pros", "other", "self", "switch"),
		"WWC": ("pros", "other", "switch", "self"),
	}
	# Fail before the expensive load/fit instead of after it.
	if war not in column_order:
		raise TypeError("Did not specify war correctly!")

	# The data directory sits one level above the working directory.
	parent_directory = os.path.dirname(os.getcwd())
	path = os.path.join(parent_directory, "Data", "BothWarsAll_Final.xlsx")
	data = pd.read_excel(path)[["patnum", char, "class"]]
	data['Patnum'] = data['patnum']
	path2 = os.path.join(parent_directory, "Data", "BothWarsText_Final.csv")
	appendon = pd.read_csv(path2)[['Patnum', 'pat_text']]
	data = data.merge(appendon, on="Patnum")
	data = data[['patnum', char, "pat_text", "class"]]
	data.columns = ['patnum', 'category', 'text', "class"]

	dict_list, _, _ = get_new_text(data, spread, char)
	data['new_text'] = dict_list
	X_train_counts = CountVectorizer().fit_transform(data['new_text'])
	X_train_tfidf = TfidfTransformer().fit_transform(X_train_counts)

	# Row labels of each domain. They are also used as row positions into the
	# TF-IDF matrix, which relies on the merged frame's default 0..n-1 index.
	train_idx = data[(condition_train_time) & (condition_train_class)].index.values
	pros_idx = data[(~condition_train_time) & (condition_test_class_pros)].index.values
	other_idx = data[(~condition_train_time) & (condition_test_class_o)].index.values
	switch_idx = data[(condition_train_time) & (~condition_train_class)].index.values

	X_train = X_train_tfidf[train_idx]
	y_train = data['category'][train_idx]

	# Make training size the same when predicting other domains as when
	# predicting the same domain: the cross-domain model is fitted on a
	# subsample whose size matches the training share used inside get_scores.
	if char == "materials":
		keep_fraction, holdout_fraction = .9, .1
	else:
		keep_fraction, holdout_fraction = .95, .05
	sample_idx = data.iloc[train_idx].sample(frac=keep_fraction).index.values
	cross_domain_model = GradientBoostingClassifier().fit(
		X_train_tfidf[sample_idx], y_train[sample_idx])

	# NOTE(review): get_scores reports plain accuracy, while the cross-domain
	# scores below are balanced accuracy - confirm they are meant to be mixed
	# in one table row.
	score_self = get_scores(
		GradientBoostingClassifier(), X_train,
		data.loc[train_idx, :].reset_index(drop=True), holdout_fraction)[4]

	def _cross_domain_score(idx):
		y_true = data['category'][idx]
		pred = cross_domain_model.predict(X_train_tfidf[idx]).round()
		# Ground truth goes first; the original passed the predictions first,
		# which changes the value because balanced accuracy is not symmetric.
		return balanced_accuracy_score(y_true, pred)

	scores = {
		"self": score_self,
		"pros": _cross_domain_score(pros_idx),
		"other": _cross_domain_score(other_idx),
		"switch": _cross_domain_score(switch_idx),
	}
	cells = [np.round(scores[key] * 100, decimals=1) for key in column_order[war]]
	return r"&{}& {} & {} & {} & {}\\".format(war, *cells)

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=-1, train_sizes=np.linspace(.04, 1.0, 7)):
	"""Plot the cross-validated balanced-accuracy learning curve of `estimator`.

	A new figure is opened and the mean held-out score is drawn against the
	number of training examples, with a shaded band between the 10% and 90%
	quantiles of the per-split scores.

	Parameters:
		estimator: estimator handed to `learning_curve`.
		title: figure title.
		X, y: features and labels.
		ylim: optional (ymin, ymax) for the y axis. Defaults to (.5, 1.0),
			the range that used to be hard-coded while this argument was
			silently ignored.
		cv: cross-validation strategy handed to `learning_curve`.
		n_jobs: parallelism handed to `learning_curve`.
		train_sizes: training-set sizes handed to `learning_curve`.

	Returns:
		The `matplotlib.pyplot` module, with the new figure current.
	"""
	plt.figure()
	plt.autoscale()
	plt.title(title)
	plt.xlabel("Training examples")
	plt.ylabel("Score")

	sizes, _, test_scores = learning_curve(
		estimator, X, y, cv=cv, n_jobs=n_jobs, scoring='balanced_accuracy', train_sizes=train_sizes)
	test_scores_mean = np.mean(test_scores, axis=1)
	band_upper = np.quantile(test_scores, .9, axis=1)
	band_lower = np.quantile(test_scores, .1, axis=1)

	plt.grid()
	plt.fill_between(sizes, band_lower, band_upper, alpha=0.1, color="g")
	plt.plot(sizes, test_scores_mean, color="g",
			 label="Cross-validation score")
	if ylim is None:
		plt.ylim(.5, 1.0)
	else:
		plt.ylim(*ylim)

	plt.legend(loc="best")
	return plt


current_directory = os.getcwd()
# Navigate up one level
parent_directory = os.path.dirname(current_directory)


def _load_characteristic(char):
	"""Return the labelled patents for `char` merged with their text.

	Columns of the result are 'patnum', 'category' (the `char` label), 'text'
	and 'class'.
	"""
	labels_path = os.path.join(parent_directory, "Data", "BothWarsAll_Final.xlsx")
	frame = pd.read_excel(labels_path)[["patnum", char, "class"]]
	frame['Patnum'] = frame['patnum']
	text_path = os.path.join(parent_directory, "Data", "BothWarsText_Final.csv")
	texts = pd.read_csv(text_path)[['Patnum', 'pat_text']]
	frame = frame.merge(texts, on="Patnum")
	frame = frame[['patnum', char, "pat_text", "class"]]
	frame.columns = ['patnum', 'category', 'text', "class"]
	return frame


def _add_context_features(frame, spread, char):
	"""Attach the keyword-context columns to `frame` and return their TF-IDF matrix."""
	dict_list, num_tot_list, dict_list_repeat = get_new_text(frame, spread, char)
	frame['new_text'] = dict_list
	frame['proximity'] = num_tot_list
	frame['proximity_text'] = dict_list_repeat
	counts = CountVectorizer().fit_transform(frame['new_text'])
	return TfidfTransformer().fit_transform(counts)


def _search_gbm_params(X, y, learning_rates, max_features, n_iter):
	"""Run the randomized hyper-parameter search and return the best parameters."""
	param_dist = {"learning_rate": learning_rates,
				  "n_estimators": sp_randint(1, 200),
				  # Integer depths 1..32; the original drew the same values as
				  # floats from np.linspace.
				  "max_depth": list(range(1, 33)),
				  'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True),
				  "max_features": max_features,
				  "min_samples_split": np.linspace(0.1, 1.0, 10, endpoint=True),
				  "loss": ['deviance', 'exponential']}
	# NOTE(review): `iid`, the 'deviance' loss and max_features='auto' are
	# believed to be gone from newer scikit-learn releases - confirm the
	# scikit-learn version this script is pinned to before upgrading.
	search = RandomizedSearchCV(GradientBoostingClassifier(), param_distributions=param_dist,
								n_iter=n_iter, cv=10, iid=False, scoring="balanced_accuracy", n_jobs=-1)
	search.fit(X, y)
	return search.best_params_


# -----------------Code generates figure B.3:
char_list = ['simplicity']
spreads = [1]
for i, char in enumerate(char_list):
	data = _load_characteristic(char)
	X_train_tfidf = _add_context_features(data, spreads[i], char)
	if char == "simplicity":
		# Default hyper-parameters are used for simplicity, so the randomized
		# search (whose result was discarded here) is no longer run.
		estimator = GradientBoostingClassifier()
	else:
		best_params = _search_gbm_params(
			X_train_tfidf, data['category'],
			learning_rates=[1, 0.5, 0.25, 0.1, 0.05, 0.01],
			max_features=list(range(1, X_train_tfidf.shape[1])),
			n_iter=60)
		estimator = GradientBoostingClassifier(**best_params)
	title = "Learning Curves (GBM)"
	cv = StratifiedShuffleSplit(n_splits=400, test_size=0.09)
	plot_learning_curve(estimator, title, X_train_tfidf, data['category'], cv=cv, n_jobs=-1)
	path_save = os.path.join(parent_directory, "Figures", 'Learning_Curve_' + char + '_latest.png')
	# figure B.3:
	print("Figure B.3 Output:")
	plt.savefig(path_save)
	plt.show()

# ----------Table B.1 -- Domain Specificity (numbers may be different due to random seed)-----------
char_list = ["comfort", "simplicity"]
spreads = [2, 2]
# The masks are built from the frame left over from the Figure B.3 loop.
# Domain_Specificity reloads the same files itself, so the masks line up with
# its frame as long as that merge yields the same rows in the same order.
condition_CW = data.patnum < 500000
condition_WWI = data.patnum > 500000
condition_pros = data['class'] == 623
condition_o = data['class'] != 623
print("Table B.1 Output:")

training_domains = (
	('CWP', condition_CW, condition_pros),
	('CWC', condition_CW, condition_o),
	('WWP', condition_WWI, condition_pros),
	('WWC', condition_WWI, condition_o),
)
for i, char in enumerate(char_list):
	print("Table B.1 Panel: ", char)
	for war, time_mask, class_mask in training_domains:
		print(Domain_Specificity(war, data, char, spreads[i], time_mask, class_mask, condition_pros, condition_o))

# -----------------Table B.2 (numbers may be different due to random seed):
char_list = ["adjustability", "comfort", "simplicity", "materials", "appearance", "cost"]
spreads = [1, 2, 2, 3, 3, 1]
print("Begin Table B.2 Output:")
for i, char in enumerate(char_list):
	data = _load_characteristic(char)
	X_train_tfidf = _add_context_features(data, spreads[i], char)
	best_params = _search_gbm_params(
		X_train_tfidf, data['category'],
		learning_rates=np.linspace(.01, 1, 10, endpoint=True),
		max_features=['auto', 'sqrt', 'log2', None],
		n_iter=50)

	clf = GradientBoostingClassifier(**best_params)
	sens, spec, sens_std, spec_std, acc, model = get_scores(clf, X_train_tfidf, data, .09)
	print(r"{} & {} & {} & {}\\".format(char, sens, spec, acc))
	print(r"&({})&({}) & \\".format(sens_std, spec_std))



